{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/luke/.local/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from lightgbm import LGBMClassifier\n",
    "import lightgbm as lgb\n",
    "import gc\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import roc_auc_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.chdir('/home/luke/Desktop/kaggle/Home_Credit_Default_Risk')\n",
    "\n",
    "#The Merged_df is a meta dataframe join all the tables together including both Train rows and Test rows.\n",
    "#Shape of test data(48744, 121) \n",
    "#Shape of train data(307511, 122) \n",
    "#Shape of merged_df(356255, 298)\n",
    "\n",
    "merged_df = pd.read_csv('processed_input_data.csv')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "meta_cols = ['SK_ID_CURR']\n",
    "meta_df = merged_df[meta_cols]\n",
    "merged_df.drop(columns=meta_cols, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_dataframe(input_df, encoder_dict=None):\n",
    "    \"\"\" Process a dataframe into a form useable by LightGBM \"\"\"\n",
    "\n",
    "    # Label encode categoricals\n",
    "    print('Label encoding categorical features...')\n",
    "    categorical_feats = input_df.columns[input_df.dtypes == 'object']\n",
    "    for feat in categorical_feats:\n",
    "        encoder = LabelEncoder()\n",
    "        input_df[feat] = encoder.fit_transform(input_df[feat].fillna('NULL'))\n",
    "    print('Label encoding complete.')\n",
    "\n",
    "    return input_df, categorical_feats.tolist(), encoder_dict\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Label encoding categorical features...\n",
      "Label encoding complete.\n"
     ]
    }
   ],
   "source": [
    "merged_df, categorical_feats, encoder_dict = process_dataframe(input_df=merged_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "non_obj_categoricals = [\n",
    "    'FONDKAPREMONT_MODE', 'HOUR_APPR_PROCESS_START', 'HOUSETYPE_MODE',\n",
    "    'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',\n",
    "    'NAME_INCOME_TYPE', 'NAME_TYPE_SUITE', 'OCCUPATION_TYPE',\n",
    "    'ORGANIZATION_TYPE', 'STATUS', 'NAME_CONTRACT_STATUS_CAVG',\n",
    "    'WALLSMATERIAL_MODE', 'WEEKDAY_APPR_PROCESS_START', 'NAME_CONTRACT_TYPE_BAVG',\n",
    "    'WEEKDAY_APPR_PROCESS_START_BAVG', 'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS', \n",
    "    'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_TYPE_SUITE_BAVG', \n",
    "    'NAME_CLIENT_TYPE', 'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', \n",
    "    'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE', 'NAME_SELLER_INDUSTRY', \n",
    "    'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION', 'NAME_CONTRACT_STATUS_CCAVG' \n",
    "]\n",
    "categorical_feats = categorical_feats + non_obj_categoricals"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "application_test = pd.read_csv('application_test.csv')\n",
    "application_train = pd.read_csv('application_train.csv')\n",
    "len_train = len(application_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/luke/anaconda3/envs/tf/lib/python3.6/site-packages/pandas/core/frame.py:3697: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  errors=errors)\n",
      "/home/luke/anaconda3/envs/tf/lib/python3.6/site-packages/lightgbm/basic.py:1040: UserWarning: Using categorical_feature in Dataset.\n",
      "  warnings.warn('Using categorical_feature in Dataset.')\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[50]\tcv_agg's auc: 0.775093 + 0.00312303\n",
      "[100]\tcv_agg's auc: 0.781779 + 0.0024516\n",
      "[150]\tcv_agg's auc: 0.783134 + 0.00257406\n",
      "[200]\tcv_agg's auc: 0.784272 + 0.00259041\n",
      "[250]\tcv_agg's auc: 0.785275 + 0.00240448\n",
      "[300]\tcv_agg's auc: 0.786504 + 0.0023492\n",
      "[350]\tcv_agg's auc: 0.786826 + 0.00241311\n",
      "[400]\tcv_agg's auc: 0.786985 + 0.00244814\n",
      "[450]\tcv_agg's auc: 0.787372 + 0.00237858\n",
      "Optimum boost rounds = 437\n",
      "Best CV result = 0.787427749715952\n"
     ]
    }
   ],
   "source": [
    "train_df = merged_df[:len_train]\n",
    "test_df = merged_df[len_train:]\n",
    "#del merged_df, app_test_df, bureau_df, bureau_balance_df, credit_card_df, pos_cash_df, prev_app_df\n",
    "gc.collect()\n",
    "\n",
    "\"\"\" Train the model \"\"\"\n",
    "target = train_df.pop('TARGET')\n",
    "test_df.drop(columns='TARGET', inplace=True)\n",
    "\n",
    "lgbm_train = lgb.Dataset(data=train_df,\n",
    "                          label=target,\n",
    "                          categorical_feature=categorical_feats,\n",
    "                          free_raw_data=False)\n",
    "\n",
    "del train_df\n",
    "gc.collect()\n",
    "\n",
    "\n",
    "lgbm_params = {\n",
    "    'boosting': 'dart',\n",
    "    'application': 'binary',\n",
    "    'learning_rate': 0.1,\n",
    "    'min_data_in_leaf': 30,\n",
    "    'num_leaves': 31,\n",
    "    'max_depth': -1,\n",
    "    'feature_fraction': 0.5,\n",
    "    'scale_pos_weight': 2,\n",
    "    'drop_rate': 0.02\n",
    "}\n",
    "\n",
    "cv_results = lgb.cv(train_set=lgbm_train,\n",
    "                     params=lgbm_params,\n",
    "                     nfold=5,\n",
    "                     num_boost_round=600,\n",
    "                     early_stopping_rounds=50,\n",
    "                     verbose_eval=50,\n",
    "                     metrics=['auc'])\n",
    "\n",
    "optimum_boost_rounds = np.argmax(cv_results['auc-mean'])\n",
    "print('Optimum boost rounds = {}'.format(optimum_boost_rounds))\n",
    "print('Best CV result = {}'.format(np.max(cv_results['auc-mean'])))\n",
    "\n",
    "clf = lgb.train(train_set=lgbm_train,\n",
    "                 params=lgbm_params,\n",
    "                 num_boost_round=optimum_boost_rounds)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = clf.predict(test_df)\n",
    "out_df = pd.DataFrame({'SK_ID_CURR': meta_df['SK_ID_CURR'][len_train:], 'TARGET': y_pred})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "out_df.to_csv('submission.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
